load(paste0(IO$output_clue, "users.Rdata"), verbose = TRUE)
## Loading objects:
## users
load(paste0(IO$output_clue, "cycles.Rdata"),verbose = TRUE)
## Loading objects:
## cycles
load(paste0(IO$output_clue, "tracking.Rdata"), verbose = TRUE)
## Loading objects:
## tracking
for(user_id in users$user_id[1:10]){
t = tracking[tracking$user_id == user_id,]
g = ggplot_user_tracking(t)
print(g)
g = ggplot_user_tracking(t, fertility_only = TRUE)
print(g)
}
## Warning: Removed 280 rows containing missing values (geom_vline).
## Warning: Removed 174 rows containing missing values (geom_point).
## Warning: Removed 15 rows containing missing values (geom_vline).
## Warning: Removed 78 rows containing missing values (geom_vline).
## Warning: Removed 16 rows containing missing values (geom_vline).
## Warning: Removed 928 rows containing missing values (geom_vline).
## Warning: Removed 36 rows containing missing values (geom_point).
## Warning: Removed 78 rows containing missing values (geom_vline).
## Warning: Removed 344 rows containing missing values (geom_vline).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_vline).
## Warning: Removed 2144 rows containing missing values (geom_vline).
## Warning: Removed 241 rows containing missing values (geom_point).
## Warning: Removed 33 rows containing missing values (geom_vline).
## Warning: Removed 112 rows containing missing values (geom_vline).
## Warning: Removed 104 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_vline).
## Warning: Removed 12600 rows containing missing values (geom_vline).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 669 rows containing missing values (geom_vline).
## Warning: Removed 36 rows containing missing values (geom_vline).
## Warning: Removed 10 rows containing missing values (geom_vline).
## Warning: Removed 42 rows containing missing values (geom_vline).
## Warning: Removed 193 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_vline).
## Warning: Removed 712 rows containing missing values (geom_vline).
## Warning: Removed 177 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_vline).
load(paste0(IO$tmp_clue,"users_aggregate_data_prep.Rdata"), verbose = TRUE)
## Loading objects:
## users_aggregate
date_seq = seq(par$start_date, par$end_date, by = 1)
indiv_indicators = data.frame(user_id = rep(unique(tracking$user_id), each = length(date_seq)),
date = rep(date_seq, lu(tracking$user_id)), stringsAsFactors = FALSE)
m = match(indiv_indicators$user_id, users$user_id)
indiv_indicators$country = users$country[m]
indiv_indicators$age_cat = users$age_cat[m]
indiv_indicators$age = users$age[m]
indiv_indicators$bmi_cat = users$bmi_cat[m]
indiv_indicators$bmi = users$bmi[m]
indiv_indicators$active_use = FALSE
users_aggregate = users_aggregate[which(users_aggregate$user_id %in% unique(indiv_indicators$user_id)),]
for(user in unique(indiv_indicators$user_id)){
j = which(users_aggregate$user_id == user)
dates_active = seq(users_aggregate$first_obs_date[j], users_aggregate$last_obs_date[j],by =1)
k = which((indiv_indicators$user_id == user) & (indiv_indicators$date %in% dates_active))
indiv_indicators$active_use[k] = TRUE
}
indiv_indicators$day_id = paste0(indiv_indicators$user_id, "_", indiv_indicators$date)
# cycle_id
cycles_exp = as.data.frame(lapply(cycles, rep, cycles$cycle_length))
cycles_exp$cycleday = ave(rep(1,nrow(cycles_exp)), cycles_exp$cycle_id, FUN =cumsum)
cycles_exp$date = cycles_exp$cycle_start + (cycles_exp$cycleday - 1)
cycles_exp$day_id = paste0(cycles_exp$user_id, "_", cycles_exp$date)
# match indiv_indicators and cycles_sub_exp
m = match(indiv_indicators$day_id, cycles_exp$day_id)
indiv_indicators$cycle_nb = cycles_exp$cycle_nb[m]
indiv_indicators$cycle_id = cycles_exp$cycle_id[m]
indiv_indicators$cycle_length = cycles_exp$cycle_length[m]
indiv_indicators$cycleday = cycles_exp$cycleday[m]
indiv_indicators$cycleday_from_end = indiv_indicators$cycleday - indiv_indicators$cycle_length - 1
# time num
indiv_indicators$time_num = par$time_num[match(indiv_indicators$date, par$date_seq)]
save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata" ))
# unprotected sex
# bleeding
tracking_bleeding = tracking[tracking$category == "period",]
indiv_indicators$bleeding = as.numeric(
factor(tracking_bleeding$type[match(indiv_indicators$day_id, tracking_bleeding$day_id)],
levels = c("spotting","light","medium","heavy")))
indiv_indicators$bleeding[is.na(indiv_indicators$bleeding)] = 0
rm(tracking_bleeding)
# mucus
mucus_types = c("creamy","sticky","egg_white")
tracking_mucus = tracking[tracking$type %in% mucus_types,]
indiv_indicators$mucus = tracking_mucus$type[match(indiv_indicators$day_id, tracking_mucus$day_id)]
indiv_indicators$mucus_num = as.numeric(factor(indiv_indicators$mucus, levels = mucus_types))
indiv_indicators$mucus_num[is.na(indiv_indicators$mucus_num)] = 0
rm(tracking_mucus)
# ovulation day
indiv_indicators$ovu_day = FALSE
indiv_indicators$ovu_day[indiv_indicators$cycleday_from_end == -13] = TRUE
# daily fertility
fertility_pattern = c(0.05,0.1,0.15,0.2,0.3,0.2,0.1)
indiv_indicators$daily_fertility = 0
k = which(indiv_indicators$ovu_day)
i = numeric(0)
for(j in k){
i = c(i, (j-5):(j+1))
}
indiv_indicators$daily_fertility[i] = rep(fertility_pattern, length(k))
ggplot(indiv_indicators[indiv_indicators$user_id %in% unique(indiv_indicators$user_id)[1:10],],
aes(x = date, y = daily_fertility, col = user_id)) +
geom_line() + guides(col = FALSE)+
facet_grid(user_id ~ . )
save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata" ))
file.copy(from = paste0(IO$output_clue, "indiv_indicators.Rdata" ),
to = paste0(IO$tmp_clue, "indiv_indicators_with_fertility.Rdata" ))
## [1] TRUE
agg = ddply(indiv_indicators,
.(cycle_id),
.fun = summarize,
n_eggwhite = sum(mucus == "egg_white", na.rm = TRUE),
mucus_quality = max(mucus_num, na.rm = TRUE),
n_bleeding_after_day_5 = sum((bleeding >=3) & (cycleday >5), na.rm = TRUE),
n_heavy_bleeding = sum(bleeding ==4,na.rm = TRUE)
)
indiv_indicators = merge(indiv_indicators, agg, by = "cycle_id", all.x = TRUE)
indiv_indicators = indiv_indicators[order(indiv_indicators$o),]
# regularity
cycles = cycles[order(cycles$user_id, cycles$cycle_nb),]
cycles$cycle_id_next = paste0(cycles$user_id, "_", cycles$cycle_nb+1)
cycles$cycle_id_next2 = paste0(cycles$user_id, "_", cycles$cycle_nb+2)
cycles$cycle_length_next = cycles$cycle_length[match(cycles$cycle_id_next, cycles$cycle_id)]
cycles$cycle_length_next2 = cycles$cycle_length[match(cycles$cycle_id_next2, cycles$cycle_id)]
cycles$sd_cycle_length = apply(cycles[,grep("cycle_length",colnames(cycles))], 1, sd, na.rm = TRUE)
cycles$sd_cycle_length = replace_NAs_with_latest_value(cycles$sd_cycle_length)
indiv_indicators$sd_cycle_length = cycles$sd_cycle_length[match(indiv_indicators$cycle_id, cycles$cycle_id)]
save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata" ))
file.copy(from = paste0(IO$output_clue, "indiv_indicators.Rdata" ),
to = paste0(IO$tmp_clue, "indiv_indicators_with_cycle_fertility.Rdata" ))
## [1] TRUE
indiv_indicators$cycle_fertility = 1/6*(
sigmoid(indiv_indicators$mucus_quality,1.5,1) +
1-sigmoid(abs(indiv_indicators$n_eggwhite - 3 ),3,1)+
exp(-0.25*indiv_indicators$n_bleeding_after_day_5) +
1 - sigmoid(indiv_indicators$n_heavy_bleeding,5,1.5)+
1 - sigmoid(abs(28.5 - indiv_indicators$cycle_length),4,1)+
1 - sigmoid(indiv_indicators$sd_cycle_length, 10,0.5)
)
indiv_indicators$overall_fertility = 1/2*(
1 - sigmoid(abs(30 - indiv_indicators$age),15,0.3)+
1 - 0.5*sigmoid(abs(25 - indiv_indicators$bmi),6,1)
)
indiv_indicators$fertility = indiv_indicators$daily_fertility *
indiv_indicators$cycle_fertility * indiv_indicators$overall_fertility
save(indiv_indicators, file = paste0(IO$output_clue, "indiv_indicators.Rdata" ))
file.copy(from = paste0(IO$output_clue, "indiv_indicators.Rdata" ),
to = paste0(IO$tmp_clue, "indiv_indicators_with_overall_fertility.Rdata" ))
## [1] FALSE
ggplot(indiv_indicators[indiv_indicators$user_id %in% unique(indiv_indicators$user_id)[1:10],],
aes(x = date, y = fertility, col = user_id)) +
geom_line() + guides(col = FALSE)+
facet_grid(user_id ~ . )
## Warning: Removed 3830 rows containing missing values (geom_path).
par$agg_col
## [1] "country" "age_cat" "bmi_cat"
We build the population indicators by aggregating by the following columns {r par$agg_col}
pop_indicators = ddply(indiv_indicators, .variable = c("date","time_num",par$agg_col),
.fun = summarize,
n_users = sum(active_use)
)
pop_indicators = pop_indicators[order(pop_indicators$country,
pop_indicators$age_cat,
pop_indicators$bmi_cat,
pop_indicators$time_num),]
pop_indicators$o = 1:nrow(pop_indicators)
save(pop_indicators, file = paste0(IO$output_clue,"pop_indicators.Rdata"))
file.copy(from = paste0(IO$output_clue,"pop_indicators.Rdata"), to = paste0(IO$tmp_clue,"pop_indicators_empty.Rdata"), overwrite = TRUE)
## [1] TRUE
agg = ddply(tracking, .variable = c("date",par$agg_col),
.fun = summarize,
n_tot_sex = sum(type %in% c("withdrawal_sex","protected_sex","unprotected_sex")),
n_prot_sex = sum(type == "protected_sex"),
n_unprot_sex = sum(type == "unprotected_sex"),
n_withdrawal = sum(type == "withdrawal_sex")
)
ggplot(agg, aes(x = date, y = n_tot_sex, col = age_cat, linetype = bmi_cat)) + geom_line() + facet_grid(country ~., scale = "free_y") + xlim(c(as.Date("2014-12-31"),as.Date("2018-01-01")))
## Warning: Removed 348 rows containing missing values (geom_path).
Needs to be divided by # of active user at each date
agg = ddply(indiv_indicators,
.variable = c("date",par$agg_col),
.fun = summarize,
daily_fertility_sum = sum(daily_fertility))
pop_indicators = merge(pop_indicators, agg, all.x = TRUE)
pop_indicators = pop_indicators[order(pop_indicators$o),]
pop_indicators$daily_fertility = pop_indicators$daily_fertility_sum / pop_indicators$n_users
ggplot(pop_indicators, aes(x = date, y = daily_fertility, col = age_cat, linetype = bmi_cat)) + geom_line() + facet_grid(country ~., scale = "free_y")
## Warning: Removed 403 rows containing missing values (geom_path).
save(pop_indicators, file = paste0(IO$output_clue,"pop_indicators.Rdata"))
file.copy(from = paste0(IO$output_clue,"pop_indicators.Rdata"), to = paste0(IO$tmp_clue,"pop_indicators_with_fertility.Rdata"), overwrite = TRUE)
## [1] TRUE